Prepare TESLA dataset
- TESLA dataset comes with nM binding affinity from netMHCpan 4.0 and hrs stability from netMHCstabpan.
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
TESLA= fread("TESLA_DATASET_608.csv")
TESLA=TESLA %>% separate(col=PMHC, into = c("HLA_Allele","Peptide"),sep="_")%>% mutate(HLA_Allele = paste0("HLA-",HLA_Allele))%>%
mutate(Length = nchar(Peptide))%>% mutate(VALIDATED = ifelse(VALIDATED==FALSE,"Negative","Positive"))%>% dplyr::rename(Immunogenicity=VALIDATED)
TESLA=TESLA %>% mutate(nM = NETMHC_PAN_BINDING_AFFINITY) %>% mutate("Thalf(h)" = BINDING_STABILITY) %>% mutate(HydroFraction = FRAC_HYDROPHOBIC)
TESLA %>% glimpse()
## Rows: 608
## Columns: 26
## $ HLA_Allele <chr> "HLA-A*01:01", "HLA-A*01:01", "HLA-A*01:01…
## $ Peptide <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PATIENT_ID <int> 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, …
## $ TISSUE_TYPE <chr> "PBMC", "PBMC", "PBMC", "PBMC", "PBMC", "P…
## $ MHC <chr> "A*01:01", "A*01:01", "A*01:01", "A*01:01"…
## $ ALT_EPI_SEQ <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PEP_LEN <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ MEASURED_BINDING_AFFINITY <dbl> 45, 33, 77, 0, 3, 1320, NA, 354, 30, 9, NA…
## $ NETMHC_PAN_BINDING_AFFINITY <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ TUMOR_ABUNDANCE <dbl> 57.819051, 51.857250, 30.389590, 159.96769…
## $ BINDING_STABILITY <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ FRAC_HYDROPHOBIC <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
## $ AGRETOPICITY <dbl> 0.019019295, 0.838764951, 0.010851108, 1.2…
## $ FOREIGNNESS <dbl> 0.000000e+00, 0.000000e+00, 9.280000e-20, …
## $ MUTATION_POSITION <int> 2, 3, 11, 5, 8, 10, 9, 8, 7, 1, 2, 1, 1, 7…
## $ NUMBER_PREDICTING <int> 11, 10, 8, 13, 13, 8, 5, 12, 6, 8, 6, 16, …
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negat…
## $ TCR_FLOW_I <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_I_QUANT <dbl> 0.0000, 0.0000, 0.0011, 0.0000, 0.0000, 0.…
## $ TCR_NANOPARTICLE <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TCR_FLOW_II <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_II_QUANT <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, …
## $ Length <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ nM <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ `Thalf(h)` <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ HydroFraction <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
Generate TESLA data binding affinity rank scores
- Use netMHCpan to get the rank score.
TESLA$HLA_Allele = gsub(x=TESLA$HLA_Allele,pattern="\\*",replacement = "")
TEST_DATA_LOCATION = "TESLA_NETMHC/"
for(allele_i in 1:length(unique(TESLA$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(TESLA$HLA_Allele)[allele_i],pattern="\\:|\\*",replacement = "")
LENGTHS=TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(TESLA$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
Read in binding affinity rank scores
# Read from folder.
TEST_DATA_LOCATION = "TESLA_NETMHC/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# map HLA allele nomenclature
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(TESLA$HLA_Allele))
# Confirm 608 rows before and after joining with full TESLA data
Netmhcpanres %>% nrow
## [1] 608
TESLA %>% nrow
## [1] 608
# Join dataset
TESLA=TESLA %>% inner_join(Netmhcpanres %>% select(Peptide, HLA_Allele, Rank) %>% distinct())
## Joining, by = c("HLA_Allele", "Peptide")
TESLA %>% nrow
## [1] 608
Run netMHCstabpan to generate binding stability rank scores
dir.create("TESLA_STABPAN")
TEST_DATA_LOCATION = "TESLA_STABPAN/"
TESLA$HLA_Allele = gsub(x=TESLA$HLA_Allele,pattern="\\*",replacement = "")
#TESLA=TESLA %>% #add star and colon
# mutate(HLA_Allele = gsub('^(.{8})(.*)$', '\\1:\\2', gsub('^(.{5})(.*)$', '\\1*\\2', HLA_Allele)))
for(allele_i in 1:length(unique(TESLA$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(TESLA$HLA_Allele)[allele_i],pattern=":",replacement = "")
DATASET = TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i])
for(i in 1:length(unique(DATASET$Length))){
LENGTH = unique(DATASET$Length)[i]
testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(TESLA$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
}
}
Read in binding stability rank scores from netMHCstabpan
TEST_DATA_LOCATION = "TESLA_STABPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(TESLA$HLA_Allele))
Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele, Rank) %>% dplyr::rename(StabRank=Rank)
TESLA=TESLA %>% select(!"Thalf(h)") %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))%>% distinct()
TESLA %>% nrow
## [1] 608
Prepare pathogenic dataset
Criteria
- Only one peptide,imm,hla observation
- Excluded any sequences composed of some non amino acid characters sometimes found in these datasets
- Filter for peptides of length 8-14 (same as TESLA).
- Do a little cleaning
- Filter for specific alleles i,.e those which stabpan can process from HLAs A,B,C
- After applying these filters there are peptides from a vast array of HLAs
# Seperate the rows for one row per Peptide, Immunogenicity, HLA observation
FullDataset=FullDataset %>% separate_rows(HLA_Allele, sep="\\|")
FullDataset=FullDataset %>% select(!c(MHCType,Sources))
FullDataset %>% select(Peptide, Immunogenicity, HLA_Allele) %>% dupes()
## No column names specified - using all columns.
## No duplicates detected.
## # A tibble: 0 × 4
## # … with 4 variables: Peptide <chr>, Immunogenicity <chr>, HLA_Allele <chr>,
## # n_copies <int>
# Perform some cleaning
FullDataset$HLA_Allele=gsub(FullDataset$HLA_Allele,pattern="\\*",replacement="")
# Exclude unusable peptides.
FullDataset=FullDataset %>% filter(!Peptide %in% grep(FullDataset$Peptide,pattern="X|l|-",value=T))
# This filter had already been applied, but double check no peptides originating from humans, i.e., no neoantigens.
FullDataset=FullDataset %>% filter(!Antigen_Organism %in% grep("Homo sapiens|cancer",Antigen_Organism,value=TRUE,ignore.case = T))
#only 8-14mers
FullDataset=FullDataset %>% filter(nchar(Peptide) %in% 8:14)
FullDataset$HLA_Allele = gsub(",",FullDataset$HLA_Allele,replacement = "")
FullDataset$Antigen_Name = gsub(",",FullDataset$Antigen_Name,replacement = "")
FullDataset$Antigen_Organism = gsub(",",FullDataset$Antigen_Organism,replacement = "")
FullDataset$Host_Organism= gsub(",",FullDataset$Host_Organism,replacement = "")
FullDataset=FullDataset %>% mutate(Length = nchar(Peptide))
# Filter for stabpan alleles.
ALLELES=fread("../../STABPAN_MHC_allele_names.txt",header=F)
FullDataset=FullDataset %>% filter(HLA_Allele %in% grep("HLA-A|HLA-B|HLA-C", HLA_Allele,value = T))
FullDataset=FullDataset %>% filter(HLA_Allele %in% ALLELES$V1)
FullDataset=FullDataset %>% distinct(Peptide,Immunogenicity,HLA_Allele,.keep_all = T)
FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive
## 26121 8530
FullDataset %>% select(Immunogenicity) %>% table%>% prop.table()
## .
## Negative Positive
## 0.7538311 0.2461689
FullDataset %>% nrow
## [1] 34651
Run netMHCpan 4.0 for BA rank/nM
- Generate nM and rank binding affinity scores
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}
Read in netMHCpan results
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 34650
Filter only for binder to MHC for susbequent analysis
- Exclude observations which are predicted to be non-binders
FullDataset=FullDataset %>% filter(Binder == 'BINDER')
FullDataset %>% nrow
## [1] 23958
FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive
## 17664 6294
FullDataset %>% select(Immunogenicity) %>% table %>% prop.table()
## .
## Negative Positive
## 0.7372903 0.2627097
Run netmhc stabpan
- Generate a binding stability prediction
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
for(i in 1:length(unique(DATASET$Length))){
LENGTH = unique(DATASET$Length)[i]
testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
}
}
Read in predictions from stab pan
- Read in rank and hrs stability score.
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele, Rank) %>% dplyr::rename(StabRank=Rank)
FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))%>% distinct()
# 23958 obs so joining worked correctly.
FullDataset %>% nrow
## [1] 23958
Compute hydrophobic fraction
library(Peptides)
library(stringi)
HYDROPHOBIC_RESIDUES = c("V","I","L","F","M","W","C") # from BARNES 2003 (SEE WELLS PAPER)
FullDataset=FullDataset %>% mutate(HydrophobicCount = stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)
Finalise TESLA vs Pathogenic datasets for analysis
TESLA = TESLA %>% mutate(Dataset = "Cancer")
FullDataset = FullDataset%>% mutate(Dataset = "Pathogenic")
# Take necessary columns of data
TESLASubset=TESLA %>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset, Rank, StabRank)
PathogenicSubset=FullDataset%>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset, Rank,StabRank)
# Combine into one DT for analysis
combinedDataset = rbind(TESLASubset,PathogenicSubset)
# Look at number of observations
combinedDataset %>% select(Dataset,Immunogenicity)%>% table
## Immunogenicity
## Dataset Negative Positive
## Cancer 571 37
## Pathogenic 17664 6294
combinedDataset =combinedDataset%>% mutate(Dataset = factor(Dataset, levels = c("Pathogenic","Cancer")))
Results
Affinity comparison between TESLA vs pathogenic data
- Compare binding affinity in nM for TESLA vs Pathogenic peptides
- Generates Fig 2A
# Generate a density plot
BA_PATHTESLA_DENSITY_PLT= combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")
mycomparisons =list(c("Positive","Negative"))
# Take the median of the dataset for TESLA-Nve and TESLA-Pve to plot dashed lines.
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))
# Generate a violin plot, comparing immunogenicity status for 1) TESLA and 2) Pathogenic peptides.
BA_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
# Generate a boxplot
BA_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)
# ECDF plot. Split data first into four groups: pathogenic pve/nve, and cancer pve/nve.
BA_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(nM, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset %>% select(nM, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=nM, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(nM)")
Fig 2A
plot_grid(BA_PATHTESLA_DENSITY_PLT, BA_PATHTESLA_BOX_PLT+theme(legend.position = "none"),BA_PATHTESLA_VIOLIN_PLT, BA_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Confirm results are consistent with for Rank score
- HLA ligands bind different MHC in different nM ranges.
- We cannot exmaine these effects in a HLA-specific manner, but we can use the rank score. According to the authors of netMHCpan: “This measure is not affected by inherent bias of certain molecules towards higher or lower mean predicted affinities.”
- Here, we observe the same pattern as with nM, but more mildly.
RANK_PATHTESLA_DENSITY_PLT= combinedDataset %>% ggplot(aes(x=Rank, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(Rank)")
mycomparisons =list(c("Positive","Negative"))
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(Rank))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(Rank))
RANK_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Rank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
RANK_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=Rank, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(Rank)")+theme_pubr(base_size = 16)
RANK_PATHTESLA_ECDF_PLT=PathogenicSubset%>% select(Rank, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset%>% select(Rank, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=Rank, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(Rank)")
plot_grid(RANK_PATHTESLA_DENSITY_PLT, RANK_PATHTESLA_BOX_PLT+theme(legend.position = "none"),RANK_PATHTESLA_VIOLIN_PLT, RANK_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Binding Stability
Hours
- Examine binding stability (hrs) between groups.
# Density plot. Comparing cancer vs pathogens, grouped by immunogenicity status.
STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))
# Violin plot. Comparing immunogenicity status for pathogens and also for cancer.
STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>%
ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
# Boxplot.
STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 16)
# ECDF
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")
Fig 2B
library(cowplot)
plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Rank score.
STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=StabRank, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(Rank)")
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(StabRank))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(StabRank))
STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>%
ggviolin(x="Immunogenicity",y="StabRank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=StabRank, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+theme_pubr(base_size = 16)
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(StabRank, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
TESLASubset %>% select(StabRank, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=StabRank, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(Rank)")
library(cowplot)
plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Compare the fraction of hydrophobicity between groups
- Compare the fraction of hydrophobicity between groups
HYDRO_PATHTESLA_DENSITY_PLT= combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% ggplot(aes(x=HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic")
MEDIAN_DATA_TESLA_NEG = combinedDataset%>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))
HYDRO_PATHTESLA_VIOLIN_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
HYDRO_PATHTESLA_BOX_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic\n ")+theme_pubr(base_size = 16)
HYDRO_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(HydroFraction, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>% distinct()%>%rbind(
TESLASubset %>% select(HydroFraction, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")%>% distinct()
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic")
Fig 3C
plot_grid(HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),HYDRO_PATHTESLA_VIOLIN_PLT, HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

Visualise these data
# Generate density plot
TCR_HYDRO_PATHTESLA_DENSITY_PLT= Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% ggplot(aes(x=TCR_HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic (TCR Contact)")
MEDIAN_DATA_TESLA_NEG = Hydro_frac_tcr_contact_dt%>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(TCR_HydroFraction))
MEDIAN_DATA_TESLA_POS = Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(TCR_HydroFraction))
# Generate violin plot
TCR_HYDRO_PATHTESLA_VIOLIN_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="TCR_HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic (TCR Contact)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
# Generate box plot
TCR_HYDRO_PATHTESLA_BOX_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=TCR_HydroFraction, fill=Dataset))+
geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic (TCR Contact)")+theme_pubr(base_size = 16)
# Generate ECDF
TCR_HYDRO_PATHTESLA_ECDF_PLT=Hydro_frac_tcr_contact_dt %>% select(TCR_HydroFraction, Immunogenicity, Peptide, Dataset)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=TCR_HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic (TCR Contact)")
plot_grid(TCR_HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), TCR_HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),TCR_HYDRO_PATHTESLA_VIOLIN_PLT,TCR_HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

HLA specific plots
- While due to low sample number of cancer peptides, we cannot compare pathogens vs cancer, but we can examine the pathogenic peptides in a HLA specific manner.
- Here, we filter for 5 common HLAs, and compare affinity in nM, stability in hours and fraction of hydrophobicity for immunogenic vs nonimmunogenic pathogenic peptides
Binding affinity nM
PathogenicSubset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% select(HLA_Allele,Immunogenicity)%>% table
## Immunogenicity
## HLA_Allele Negative Positive
## HLA-A01:01 843 229
## HLA-A02:01 1624 1906
## HLA-B07:02 688 401
## HLA-B40:01 783 122
## HLA-C07:02 12 41
FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+facet_wrap(~HLA_Allele,nrow=3)+theme_pubr(base_size = 14)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity (nM)")

Binding stability hrs
FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+facet_wrap(~HLA_Allele,nrow=3)+theme_pubr(base_size = 14)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability (hrs)")

Fraction of hydrophobicity for the full length peptide.
FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 14)+facet_wrap(~HLA_Allele,nrow=3)+stat_compare_means(label = "p.signif",label.x.npc = "center", label.y = 0.9)+ylab("Fraction Hydrophobic")+ylim(0,1.0)
